import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
cf.go_offline()
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
#Load the excel files.
app_train = pd.read_csv('application_train.csv')
app_test = pd.read_csv('application_test.csv')
bureau = pd.read_csv('bureau.csv')
bureau_balance = pd.read_csv('bureau_balance.csv')
credit_card = pd.read_csv('credit_card_balance.csv')
installments = pd.read_csv('installments_payments.csv')
pos_cash = pd.read_csv('POS_CASH_balance.csv')
previous_app = pd.read_csv('previous_application.csv')
#Let's take a look at the initial columns in the train data
app_train.head()
#Check the shape of the dataset
print('the shape of the train data is:',app_train.shape)
print('the shape of the test data is:',app_test.shape)
print('the shape of the bureau data is:',bureau.shape)
print('the shape of the bureau_balance data is:',bureau_balance.shape)
print('the shape of the credit card data is:',credit_card.shape)
print('the shape of the installments data is:',installments.shape)
print('the shape of the pos cash data is:',pos_cash.shape)
print('the shape of the previous_application data is:',previous_app.shape)
#Lets check the missing values and percentage of missing values
def missing_values(df):
miss_values = df.isnull().sum()
per_miss_values = 100*df.isnull().sum()/len(df)
#Create a table to show these 2 values
table = pd.concat([miss_values,per_miss_values],axis=1)
table = table.rename(columns = {0:'Missing Values', 1:'% of Total values'})
table = table[table.iloc[:,1]!=0].sort_values('% of Total values', ascending = False)
return table
missing_train = missing_values(app_train)
missing_test = missing_values(app_test)
missing_bureau = missing_values(bureau)
missing_bureau_balance = missing_values(bureau_balance)
missing_creditcard = missing_values(credit_card)
missing_installments = missing_values(installments)
missing_poscash = missing_values(pos_cash)
missing_previous_app = missing_values(previous_app)
#Lets get on with exploring the data. We will start off with a few attributes from the train data and gradually include attributes
#from other datasets
##Lets check the distribution of the target.
count = app_train['TARGET'].value_counts()
fr = pd.DataFrame({'labels':count.index, 'values':count.values})
fr.iplot(kind='pie', labels = 'labels', values = 'values', title = 'Distribution of the target variable')
#Its a very imbalanced problem. ~92% of the people have repaid their loans on time.
#Separate the people whove repaid loans from the people who have not..
app_train['TARGET'].dtypes
repaid = app_train[app_train['TARGET']==0]
non_repaid = app_train[app_train['TARGET']==1]
#Let's check the trend of the time and date of loan application.
count0 = repaid['WEEKDAY_APPR_PROCESS_START'].value_counts()
count1 = non_repaid['WEEKDAY_APPR_PROCESS_START'].value_counts()
data1 = go.Bar(x=count0.index, y=count0.values, name='Loan repayers')
data2 = go.Bar(x=count1.index, y=count1.values, name='Loan non-repayers')
data = [data1,data2]
layout = go.Layout(barmode = 'group')
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='barplot')
#-----------------------------------------------------------------------------
count2 = repaid['HOUR_APPR_PROCESS_START'].value_counts().sort_index()
count3 = non_repaid['HOUR_APPR_PROCESS_START'].value_counts().sort_index()
trace0 = go.Scatter(x=count2.index, y=count2.values, mode='lines+markers', name='Loan repayers')
trace1 = go.Scatter(x=count3.index, y=count3.values, mode='lines+markers', name='Loan non repayers')
trace = [trace0,trace1]
py.iplot(trace, filename='Time distribution')
#There seems to be no significant relationship/insights between the target variable and the time and date of applications filed.
#Let's get on with the analysis of loans.
count = app_train['NAME_CONTRACT_TYPE'].value_counts()/len(app_train)
fig1 = [go.Bar(x=count.index, y=count.values, name='Types of loans distribution')]
layout1 = go.Layout(title = 'Types of loans distribution')
fig_1 = go.Figure(data=fig1, layout=layout1)
py.iplot(fig_1, filename='simple bar')
#-----------------------------------------------------------------------------------------------------------
#Almost 90% of the loans are cash loans. Now within each type of loan what are the percentages of loan repayers and non-repayers.
count = app_train['NAME_CONTRACT_TYPE'].value_counts()
count_0 =[]
count_1 =[]
for i in count.index:
count_0.append(np.sum(app_train['TARGET'][app_train['NAME_CONTRACT_TYPE']==i]==0))
count_1.append(np.sum(app_train['TARGET'][app_train['NAME_CONTRACT_TYPE']==i]==1))
trace1 = go.Bar(x=count.index, y=count_0/count.sum(), name = '0')
trace2 = go.Bar(x=count.index, y=count_1/count.sum(), name = '1')
data = [trace1,trace2]
layout = go.Layout(title = 'Loan types distribution in terms of repayment')
figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='loan plot')
#Let's consider the gender of the applicants. The gender breakdown and the relationship of each gender with loan payment/non payment
gender = app_train['CODE_GENDER'].value_counts()
gd = pd.DataFrame({'labels':gender.index, 'values':gender.values})
gd.iplot(kind='pie', labels='labels', values='values', title = 'distribution of the gender variable')
#--------------------------------------------------------------------------------------------------
male = app_train[app_train['CODE_GENDER']=='M']['TARGET'].value_counts()
female = app_train[app_train['CODE_GENDER']=='F']['TARGET'].value_counts()
fig = {'data':[{'labels':male.index, 'values':male.values, 'type':'pie', 'hoverinfo':'label+percent+name', 'text':'Male', 'hole':0.6, 'domain':{'x':[0,0.48]}},
{'labels':female.index, 'values':female.values, 'type':'pie', 'hoverinfo':'label+percent+name', 'text':'Female', 'hole':0.6, 'domain':{'x':[0.52,1]}}
],
'layout': {'title': 'Gender Breakdown in terms of loans applied',
'showlegend': True,
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Male",
"x": 0.19,
"y": 0.50
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Female",
"x": 0.81,
"y": 0.50
}
]
}
}
py.iplot(fig, filename='gender donut')
#-----------------------------------------------------------------------------------
#Gender distribution in terms of loans(cash vs revolving loans)
temp1 = app_train[app_train['NAME_CONTRACT_TYPE']=='Cash loans']['CODE_GENDER'].value_counts()
temp2 = app_train[app_train['NAME_CONTRACT_TYPE']=='Revolving loans']['CODE_GENDER'].value_counts()
fig = {'data':[{'labels':temp1.index, 'values':temp1.values, 'type':'pie', 'hole':0.6, 'domain':{'x':[0,0.48]}, 'hoverinfo':'label+percent+name'},
{'labels':temp2.index, 'values':temp2.values, 'type':'pie', 'hole':0.6, 'domain':{'x':[0.52,1]}, 'hoverinfo':'label+percent+name'}
],
'layout': {'title': 'Gender Distribution in terms of types of loan applied',
'showlegend': True,
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Cash Loans",
"x": 0.16,
"y": 0.50
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Revolving Loans",
"x": 0.86,
"y": 0.50
}
]
}
}
py.iplot(fig, filename='loan gender plot')
#Analyze the client attributes. Whether the client has a vehicle, house or/children
vehicle = app_train['FLAG_OWN_CAR'].value_counts()
house = app_train['FLAG_OWN_REALTY'].value_counts()
child = app_train['CNT_CHILDREN'].value_counts()
fig = {
'data': [
{
'labels': vehicle.index,
'values': vehicle.values,
'type': 'pie',
'name': 'Vehicle',
'domain': {'x': [0, .48],
'y': [.52, 1]},
'hoverinfo':'label+percent+name',
'text':'Vehicle Owned',
'textposition':'inside',
'hole':0.5
},
{
'labels': house.index,
'values': house.values,
'type': 'pie',
'name': 'House',
'domain': {'x': [.52, 1],
'y': [.52, 1]},
'hoverinfo':'label+percent+name',
'text':'Vehicle Owned',
'textposition':'inside',
'hole':0.5
},
{
'labels': child.index,
'values': child.values,
'type': 'pie',
'name': 'Children',
'domain': {'x': [.33, .67],
'y': [0, .48]},
'hoverinfo':'label+percent+name',
'text':'Vehicle Owned',
'textposition':'inside',
'hole':0.5
}
],
'layout': {'title': 'Breakdown of Loan Applicants',
'showlegend': False,
"annotations": [
{
"font": {
"size": 10
},
"showarrow": False,
"text": "VehicleOwned",
"x": 0.19,
"y": 0.77
},
{
"font": {
"size": 10
},
"showarrow": False,
"text": "HouseOwners",
"x": 0.81,
"y": 0.78
},
{
"font": {
"size": 10
},
"showarrow": False,
"text": "Children",
"x": 0.50,
"y": 0.23
}
]
}
}
py.iplot(fig, filename='donut')
#Do ownership of vehicles and houses affect the repayment of the loan?Let's check it out
vehicle = app_train['FLAG_OWN_CAR'].value_counts()
count_0 = []
count_1 = []
for i in vehicle.index:
count_0.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_CAR']==i]==0))
count_1.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_CAR']==i]==1))
trace1 = go.Bar(x=vehicle.index, y=count_0/vehicle.sum(), name='0')
trace2 = go.Bar(x=vehicle.index, y=count_1/vehicle.sum(), name='1')
data = [trace1,trace2]
layout = go.Layout(title = 'Vehicle Owner distribution in terms of repayment')
figure = go.Figure(data=data, layout=layout)
py.iplot(figure, filename='vehicle plot')
#-------------------------------------------------------------------------------
house = app_train['FLAG_OWN_REALTY'].value_counts()
count_0 = []
count_1 = []
for i in house.index:
count_0.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_REALTY']==i]==0))
count_1.append(np.sum(app_train['TARGET'][app_train['FLAG_OWN_REALTY']==i]==1))
trace1 = go.Bar(x=house.index, y=count_0/house.sum(), name='0')
trace2 = go.Bar(x=house.index, y=count_1/house.sum(), name='1')
trace=[trace1,trace2]
layout = go.Layout(title = 'Loan repayment probability among house owners')
fig = go.Figure(data=trace, layout=layout)
py.iplot(fig, filename='House loans')
#How does number of children affect the loan repayment?
ch = app_train['CNT_CHILDREN'].value_counts().sort_index()
df = pd.DataFrame({'Children':ch.index, 'Count':ch.values})
df.iplot(kind='pie', labels='Children', values='Count', title = 'Children distribution among loan applicants')
#child = pd.DataFrame(app_train['CNT_CHILDREN'].value_counts().sort_index())
#child1 = child.reset_index()
#child1.columns = ['Children','Count']
#child1
count0 = []
count1 = []
for i in ch.index:
count0.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==0))
count1.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==1))
trace1 = go.Bar(x=ch.index, y=count0/ch.sum()*100, name='0')
trace2 = go.Bar(x=ch.index, y=count1/ch.sum()*100, name='1')
trace = [trace1, trace2]
layout = go.Layout(title = 'Number of children vs loan repayment')
fig = go.Figure(data=trace, layout=layout)
py.iplot(fig, filename='Child plots')
count0 =[]
count1 =[]
for i in ch.index:
count0.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==0))
count1.append(np.sum(app_train['TARGET'][app_train['CNT_CHILDREN']==i]==1))
child_data = pd.DataFrame({'Children':ch.index, 'Count':ch.values, 'Count_0':count0, 'Count_1':count1})
child_data
income = [f for f in app_train['AMT_INCOME_TOTAL']]
credit = [i for i in app_train['AMT_CREDIT']]
annuity = [j for j in app_train['AMT_ANNUITY']]
goods = [k for k in app_train['AMT_GOODS_PRICE']]
trace1 = go.Histogram(x=credit, opacity = 0.5)
trace_1 = [trace1]
lay_1 = go.Layout(title = 'Distribution of Credit of loan applicants', xaxis=dict(title='Credit'),yaxis=dict(title='No.of Applicants'))
figu_1 = go.Figure(data=trace_1, layout = lay_1)
py.iplot(figu_1, filename = 'continuous plots')
#---------------------------------------------------------------------------
trace2 = go.Histogram(x=annuity, opacity = 0.5)
trace_2 = [trace2]
lay_2 = go.Layout(title = 'Distribution of Annuity of loan applicants', xaxis=dict(title='Annuity'),yaxis=dict(title='No.of Applicants'))
figu_2 = go.Figure(data=trace_2, layout = lay_2)
py.iplot(figu_2, filename = 'continuous plots')
#---------------------------------------------------------------------------
trace3 = go.Histogram(x=goods, opacity = 0.5, xbins=dict(size = 1))
trace_3 = [trace3]
lay_3 = go.Layout(title = 'Distribution of Goods Amount of loan applicants', xaxis=dict(title='Goods'),yaxis=dict(title='No.of Applicants'))
figu_3 = go.Figure(data=trace_3, layout = lay_3)
py.iplot(figu_3, filename = 'continuous plots')
#Another way to plot these continuous vaariables would be to create bins of suitable sizes.
#This can be done by finding the 25th, 50th and 75th percentile of the variables.
quar = app_train['AMT_CREDIT'].quantile([0.25, 0.5, 0.75, 1])
quar
#Who was accompanying the client?
client = app_train['NAME_TYPE_SUITE'].value_counts()
cl = pd.DataFrame({'labels':client.index, 'values':client.values})
cl.iplot(kind = 'pie', labels = 'labels', values = 'values', title = 'Distribution of people accompanying the loan applicant')
#------------------------------------------------------------------------------
#Do the people accompanying the client have an influence on loan application.
count0 = []
count1 = []
for i in client.index:
count0.append(np.sum(app_train['TARGET'][app_train['NAME_TYPE_SUITE']==i]==0))
count1.append(np.sum(app_train['TARGET'][app_train['NAME_TYPE_SUITE']==i]==1))
tr1 = go.Bar(x=client.index, y=count0/client.values, name='0')
tr2 = go.Bar(x=client.index, y=count1/client.values, name='1')
tr = [tr1,tr2]
la = go.Layout(title = 'Effect of Client personnel on Loan applications', xaxis = dict(title = 'Personnel'), yaxis = dict(title = '% of Applications'))
fi = go.Figure(data=tr, layout=la)
py.iplot(fi, filename='personnel plots')
#The income type of the applicant
inc = app_train['NAME_INCOME_TYPE'].value_counts()
it = pd.DataFrame({'labels':inc.index, 'values':inc.values})
it.iplot(kind='pie', labels = 'labels', values = 'values', title = 'Income type of the applicant')
#--------------------------------------------------------------------------------------------------
count0 = app_train[app_train['TARGET']==0]['NAME_INCOME_TYPE'].value_counts()
count1 = app_train[app_train['TARGET']==1]['NAME_INCOME_TYPE'].value_counts()
fig = {'data':[{'type':'pie', 'labels':count0.index, 'values':count0.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0,0.48]}},
{'type':'pie', 'labels':count1.index, 'values':count1.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0.52,1]}}
],
'layout':{'title': 'Breakdown of Income type of loan applicants',
"showlegend": True,
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loans Repaid",
"x": 0.15,
"y": 0.50
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loans Unpaid",
"x": 0.85,
"y": 0.50
}
]
}}
py.iplot(fig, filename='pieliot')
#Highest education of the client
edu = app_train['NAME_EDUCATION_TYPE'].value_counts()
c0 = []
c1 = []
for i in edu.index:
c0.append(np.sum(app_train['TARGET'][app_train['NAME_EDUCATION_TYPE']==i]==0))
c1.append(np.sum(app_train['TARGET'][app_train['NAME_EDUCATION_TYPE']==i]==1))
tr1 = go.Bar(x=edu.index, y=c0/edu.values, name='0')
tr2 = go.Bar(x=edu.index, y=c1/edu.values, name='1')
tr = [tr1,tr2]
layo = go.Layout(title = 'Education distribution with repect to paid vs unpaid loans', barmode='stack')
fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
#family status of the client
fam = app_train['NAME_FAMILY_STATUS'].value_counts()
c0 = []
c1 = []
for i in fam.index:
c0.append(np.sum(app_train['TARGET'][app_train['NAME_FAMILY_STATUS']==i]==0))
c1.append(np.sum(app_train['TARGET'][app_train['NAME_FAMILY_STATUS']==i]==1))
tr1 = go.Bar(x=fam.index, y=c0/fam.values, name='0')
tr2 = go.Bar(x=fam.index, y=c1/fam.values, name='1')
tr = [tr1,tr2]
layo = go.Layout(title = 'Family Status with respect to paid vs unpaid loans', barmode = 'stack')
fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
#HOUSe status of the client
house = app_train['NAME_HOUSING_TYPE'].value_counts()
c0 = []
c1 = []
for i in house.index:
c0.append(np.sum(app_train['TARGET'][app_train['NAME_HOUSING_TYPE']==i]==0))
c1.append(np.sum(app_train['TARGET'][app_train['NAME_HOUSING_TYPE']==i]==1))
tr1 = go.Bar(x=house.index, y=c0/house.values, name='0')
tr2 = go.Bar(x=house.index, y=c1/house.values, name='1')
tr = [tr1,tr2]
layo = go.Layout(title = 'House Status with respect to paid vs unpaid loans', barmode = 'stack')
fi = go.Figure(data=tr, layout=layo)
py.iplot(fi, filename = 'lkih')
#Organisation type
org = app_train['ORGANIZATION_TYPE'].value_counts()
og = pd.DataFrame({'labels':org.index, 'values':org.values})
tr = [go.Bar(x=org.index, y=org.values)]
la = go.Layout(title = 'Distribution of Organisation type among loan applicants', xaxis = dict(tickangle = -45, tickfont = dict(size=10)))
fig = go.Figure(data = tr, layout = la)
py.iplot(fig, filename='jghgh')
#-------------------------------------------------------------
#the top and bottom in terms of loan repayment
repay = app_train[app_train['TARGET']==0]['ORGANIZATION_TYPE'].value_counts()
top = repay[:10]
non_repay = app_train[app_train['TARGET']==1]['ORGANIZATION_TYPE'].value_counts()
topn = non_repay[:10]
fig = {'data':[{'type':'pie', 'labels':top.index, 'values':top.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0,0.48]}},
{'type':'pie', 'labels':topn.index, 'values':topn.values, 'hole': 0.5, 'hoverinfo':'label+percent+name', 'domain':{'x':[0.52,1]}}
],
'layout':{'title': 'Top loan payers and defaulters',
"showlegend": True,
"annotations": [
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loan Payers",
"x": 0.15,
"y": 0.50
},
{
"font": {
"size": 20
},
"showarrow": False,
"text": "Loan Defaulters",
"x": 0.87,
"y": 0.50
}
]
}}
py.iplot(fig, filename='pieliot')
#Let's plot the correlation between the attributes and the target
cor = app_train.corr()['TARGET'].sort_values()
print('The top positive correlations are \n{}'.format(cor.tail(25)))
print('\nthe top negative correlations are \n{}'.format(cor.head(25)))
#Let us consider top 3 positively correlated values
#Days_birth is in negative values. Lets consider the positive values and check the distribution of these ages
print('The ages range between {} years and {} years'.format(round(-(app_train['DAYS_BIRTH'].max())/365), round(-(app_train['DAYS_BIRTH'].min())/365)))
#--------------------------------------------------------------------------------------------------------
age_years = round(-(app_train['DAYS_BIRTH']/365))
age = [a for a in age_years]
d = [go.Histogram(x=age)]
l = go.Layout(title = 'Age Distribution of Loan Applicants', xaxis = dict(title ='Age'), yaxis = dict(title = 'Frequency'))
f = go.Figure(data=d, layout = l)
py.iplot(f, filename='HGTTHHHH')
#-------------------------------------------------------------------------------------------------------
#Lets divide the age into buckets and see the age categories with respect to loans paid and unpaid.
app_train['DAYS_BIRTH'] = age_years
app_train['DAYS_BIRTH']
app = app_train.groupby(['DAYS_BIRTH','TARGET'])['TARGET'].count().unstack().reset_index()
app = app.rename(columns = {'DAYS_BIRTH':'AGE_IN_YEARS', 0:'target0', 1:'target1'})
t0 = go.Bar(x=app['AGE_IN_YEARS'], y=app['target0'])
l0 = go.Scatter(x=app['AGE_IN_YEARS'], y=app['target0'])
t_0 = [t0,l0]
la0 = go.Layout(title = 'Loans paid', xaxis = dict(title = 'Age'), yaxis = dict(title = 'Frequency'))
f0 = go.Figure(data = t_0, layout = la0)
py.iplot(f0, filename = 'ghjek')
t1 = go.Bar(x=app['AGE_IN_YEARS'], y=app['target1'])
l1 = go.Scatter(x=app['AGE_IN_YEARS'], y=app['target1'])
t_1 = [t1,l1]
la1 = go.Layout(title = 'Loans Unpaid', xaxis = dict(title = 'Age'), yaxis = dict(title = 'Frequency'))
f1 = go.Figure(data = t_1, layout = la1)
py.iplot(f1, filename = 'ghjek')
#Region rating client w city
app_train['REGION_RATING_CLIENT_W_CITY'].value_counts()
#reg = app_train.groupby(['REGION_RATING_CLIENT_W_CITY','TARGET'])['TARGET'].count().unstack().reset_index()
#The top 3 negative correlations are ext_source1, 2 and 3. Let's plot them against the target variable.
ext1 = app_train[['EXT_SOURCE_1', 'TARGET']].dropna()
ext2 = app_train[['EXT_SOURCE_2', 'TARGET']].dropna()
ext3 = app_train[['EXT_SOURCE_3', 'TARGET']].dropna()
import seaborn as sns
plt.figure(figsize=(12,8))
sns.kdeplot(ext1['EXT_SOURCE_1'])
sns.kdeplot(ext2['EXT_SOURCE_2'])
sns.kdeplot(ext3['EXT_SOURCE_3'])
plt.figure(figsize=(12,8))
sns.kdeplot(ext1[ext1['TARGET']==0]['EXT_SOURCE_1'])
sns.kdeplot(ext1[ext1['TARGET']==1]['EXT_SOURCE_1'])
sns.kdeplot(ext2[ext2['TARGET']==0]['EXT_SOURCE_2'])
sns.kdeplot(ext2[ext2['TARGET']==1]['EXT_SOURCE_2'])
sns.kdeplot(ext3[ext3['TARGET']==0]['EXT_SOURCE_3'])
sns.kdeplot(ext3[ext3['TARGET']==1]['EXT_SOURCE_3'])
#Done with the application_train data. Let's move on to the bureau and bureau_balance data
#We can merge the bureau and bureau_balance tables
print('the shape of the bureau data set is {}'.format(bureau.shape))
print('the number of unique ids in bureau data set are {}'.format(bureau['SK_ID_BUREAU'].nunique()))
print('the shape of the bureau_balance data set is {}'.format(bureau_balance.shape))
print('the number of unique ids in bureau data set are {}'.format(bureau_balance['SK_ID_BUREAU'].nunique()))
bureau_new_inner = bureau.merge(bureau_balance, how = 'inner', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU').dropna()
bureau_new_inner_drop = bureau.merge(bureau_balance, how = 'inner', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
bureau_new_left = bureau.merge(bureau_balance, how = 'left', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU').dropna()
bureau_new_left_drop = bureau.merge(bureau_balance, how = 'left', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
print('the shape of left join without dropna is {}'.format(bureau_new_left_drop.shape))
print('the shape of left join with drop is {}'.format(bureau_new_left.shape))
print('the shape of inner join without drop is {}'.format(bureau_new_inner_drop.shape))
print('the shape of inner join with drop is {}'.format(bureau_new_inner.shape))
bureau_new_left_drop.SK_ID_CURR.nunique()
bureau_new_left_drop.isnull().sum()
print(bureau_new_left.SK_ID_CURR.nunique())
print(app_train.SK_ID_CURR.nunique())
print(credit_card.SK_ID_CURR.nunique())
print(pos_cash.SK_ID_CURR.nunique())
print(installments.SK_ID_CURR.nunique())
print(previous_app.SK_ID_CURR.nunique())
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(bureau_new_left.SK_ID_CURR.unique())], set_labels = ('Application train', 'Combined Bureau') )
plt.title('Number of common ids between Application train and bureau combined data')
plt.show()
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(credit_card.SK_ID_CURR.unique())], set_labels = ('Application train', 'Credit Card') )
plt.title('Number of common ids between Application train and Credit Card data')
plt.show()
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(pos_cash.SK_ID_CURR.unique())], set_labels = ('Application train', 'POS Cash') )
plt.title('Number of common ids between Application train and POS Cash data')
plt.show()
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(installments.SK_ID_CURR.unique())], set_labels = ('Application train', 'Installments') )
plt.title('Number of common ids between Application train and installments data')
plt.show()
from matplotlib_venn import venn2
plt.figure(figsize=(10,7))
venn2([set(app_train.SK_ID_CURR.unique()), set(previous_app.SK_ID_CURR.unique())], set_labels = ('Application train', 'Previous App') )
plt.title('Number of common ids between Application train and Previous Application data')
plt.show()
#Let us analyze the bureau and the bureau balance data
print(bureau.shape)
print(bureau_balance.shape)
print(bureau.SK_ID_BUREAU.nunique())
print(bureau_balance.SK_ID_BUREAU.nunique())
#Let's aggregate the bureau_balance column
bb_orig = bureau_balance.columns.tolist()
no_months = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size().reset_index()
first = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].min().tolist()
last = bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max().tolist()
counts = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts().unstack().reset_index()
no_months['first_month'] = first
no_months['last_month'] = last
no_months1 = no_months.merge(counts, on='SK_ID_BUREAU', how='left')
bureau_balance = no_months1
bb_new = [c for c in bureau_balance.columns if c not in bb_orig]
bureau_balance.head()
#Let's aggregate the bureau column
#Drop the currency attribute as it is 99.9% the same value1
bureau.drop(columns=['CREDIT_CURRENCY'], axis=1, inplace=True)
#Separate the categorical colmuns and dummify them.
orig_col = bureau.columns.tolist()
num_cols = [col for col in bureau.columns if bureau[col].dtype!='object']
cat_cols = [col for col in bureau.columns if bureau[col].dtype=='object']
bureau = pd.get_dummies(bureau, columns=cat_cols)
bureau.head()
#Let's join bureau and bureau_balnce datasets
bur_agg = bureau.merge(bureau_balance, on='SK_ID_BUREAU', how='left')
bur_agg.MONTHS_BALANCE.isnull().sum()
num_aggregations = {
'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
'DAYS_CREDIT_UPDATE': ['mean'],
'CREDIT_DAY_OVERDUE': ['max', 'mean'],
'AMT_CREDIT_MAX_OVERDUE': ['mean'],
'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
'AMT_CREDIT_SUM_OVERDUE': ['mean'],
'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
'AMT_ANNUITY': ['max', 'mean'],
'CNT_CREDIT_PROLONG': ['sum']
}
new_cols = [c for c in bur_agg.columns if c not in orig_col]
cat_aggregations = {}
for col in new_cols: cat_aggregations[col] = ['mean']
bureau = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
new_cols